** Rita QC code for UKB
** Last edit 16.06.2020
** Edit: change to the right UKB ids in the quality control data 
** Rita code for removing people with withdrawn consent 
** Adapted to Sibs project

// ---------------------------------------------------------------------
// Step 1: convert the raw text inputs (sample QC file, FAM file, PCs)
// into Stata data sets. All paths are relative to the working directory.
// ---------------------------------------------------------------------
clear all

// Load the UKB sample QC file into Stata
import delimited "Analysis\Input\ukb_sqc_v2_VU_norfaceID.txt", delimiter(space, collapse) 
*(26 vars, 487,398 obs)

// Save as a Stata data set.
// The data set built with the old key file is in quality_control_stata.dta, starting with sample_1
// NOTE: the path used to start with a backslash, which makes it relative to the
// drive root instead of the working directory; the merge further down reads the
// file from the relative path, so the two must agree.
save "Analysis\Input\quality_control_stata_v2.dta", replace

// Load the FAM file (chromosome 1, HM3 SNP set) to get the family and individual ids
import delimited "Analysis\Input\ukb_hm3_chr1.fam", delimiter(space, collapse) clear

// Only the first two columns are needed: v1 is stored as the family id, v2 as the individual id
keep v1 v2
rename v2 id_ukb
rename v1 id_ukb_fam

save "Analysis\Input\ukb_hm3_chr1.dta", replace

// Load the PCs - these data come with Norface ids directly
import delimited "Analysis\Input\ukb_sqc_v2_PCs.txt", delimiter(space, collapse) clear

// Save the first PC as a Stata data set (only pc1 is used in the QC below)
keep id_norface pc1
save "Analysis\Input\ukb_sqc_v2_PC1.dta", replace

// ---------------------------------------------------------------------
// Step 2: combine self-reported ethnicity (extracted UKB data) with the
// sample QC information and the first principal component, by id_norface.
// ---------------------------------------------------------------------

// Load the UKB data set; only the id and ethnic background are needed
use "Analysis\Input\ExtractedData.dta", clear
keep ID c_ethnicbackground

// Value labels for the ethnic background codes (retrieved from ukb23283 do file)
label define c_ethnicbackground ///
    4001 "Caribbean" ///
    3001 "Indian" ///
    1 "White" ///
    2001 "White and Black Caribbean" ///
    1001 "British" ///
    3002 "Pakistani" ///
    2 "Mixed" ///
    4002 "African" ///
    1002 "Irish" ///
    2002 "White and Black African" ///
    3003 "Bangladeshi" ///
    3 "Asian or Asian British" ///
    4003 "Any other Black background" ///
    1003 "Any other white background" ///
    2003 "White and Asian" ///
    3004 "Any other Asian background" ///
    4 "Black or Black British" ///
    2004 "Any other mixed background" ///
    5 "Chinese" ///
    6 "Other ethnic group" ///
    -1 "Do not know" ///
    -3 "Prefer not to answer"
// Attach the label to the variable; without this the label is only defined
// and the tabulation below shows the raw numeric codes.
label values c_ethnicbackground c_ethnicbackground
tab c_ethnicbackground
label list c_ethnicbackground

// Merge 1: add the sample QC variables
rename ID id_norface
merge 1:1 id_norface using "Analysis\Input\quality_control_stata_v2.dta"

/*
  Result                           # of obs.
    -----------------------------------------
    not matched                        15,311
        from master                    15,210  (_merge==1)
        from using                        101  (_merge==2)

    matched                           487,297  (_merge==3)
    -----------------------------------------

*/

// Keep the result of merge 1 under its own name; the next merge creates _merge again
rename _merge _merge1

// Merge 2: add the first principal component
merge 1:1 id_norface using "Analysis\Input\ukb_sqc_v2_PC1.dta"

/*
  Result                           # of obs.
    -----------------------------------------
    not matched                        14,265
        from master                    14,248  (_merge==1)
        from using                         17  (_merge==2)

    matched                           488,360  (_merge==3)
    -----------------------------------------

*/

// Store the combined data set and reload it, so the steps below can be rerun from here
save "Analysis\Input\complete_sample_qc_info_ukb_v2.dta", replace
use "Analysis\Input\complete_sample_qc_info_ukb_v2.dta", clear

// ---------------------------------------------------------------------
// Step 3: build the QC flag. flag == 1 marks a sample that fails at
// least one of the criteria below; flag == 0 means no criterion was hit.
// ---------------------------------------------------------------------
generate flag = 0

// Criterion 1: bad genotype data (heterozygosity / missingness outliers). There is no 1.
replace flag = 1 if hetmissingoutliers == 1
*(0 real changes made)
* the same 

/* Criterion skipped: undefined sex.
There is no sex info in the new qc file, and in the extracted data there are no people with missing sex
-> skipping this step 
replace flag = 1 if sex == 0
*(3 real changes made)
* the same */

// Criterion 2: putative sex chromosome aneuploidy
replace flag = 1 if putativesexchromosomeaneuploidy == 1
*(652 real changes made)
* the same 

// Criterion 3: misreported sex (submitted gender differs from the inferred one)
replace flag = 1 if submittedgender != inferredgender
*(192 changes made)
* the same 

// Criterion 4: not self-reported white.
// Codes 1, 1001, 1002 and 1003 are White, British, Irish and Any other white background.
generate self_reported_white = inlist(c_ethnicbackground, 1, 1001, 1002, 1003)
replace flag = 1 if !self_reported_white
*(29,861 real changes made)
* Dilnoza -(29,882 real changes made)


// Criterion 5: PC1 > 0 (non europeans).
// NOTE: Stata treats a missing value as larger than any number, so samples with
// missing pc1 (no match in the PC file) are flagged by this condition as well.
replace flag = 1 if pc1 > 0
*(24,798 real changes made)
* Dilnoza - (24,734 real changes made)

tab flag
/*       flag |      Freq.     Percent        Cum.
------------+-----------------------------------
          0 |    447,180       88.96       88.96
          1 |     55,506       11.04      100.00
------------+-----------------------------------
      Total |    502,686      100.00
	  
	  
Dilnoza 

       flag |      Freq.     Percent        Cum.
------------+-----------------------------------
          0 |    447,165       88.97       88.97
          1 |     55,460       11.03      100.00
------------+-----------------------------------
      Total |    502,625      100.00

*/


// ---------------------------------------------------------------------
// Step 4: mark samples that did not match in either merge, flag them
// too, and keep only the flagged samples.
// ---------------------------------------------------------------------

// missing_info == 1 when the sample is absent from the quality control file
// (_merge1 != 3) or from the PC1 file (_merge != 3)
generate missing_info = (_merge1 != 3) | (_merge != 3)

tab flag missing_info


/*             |     missing_info
      flag |         0          1 |     Total
-----------+----------------------+----------
         0 |   458,451     13,527 |   471,978 
         1 |    28,869      1,825 |    30,694 
-----------+----------------------+----------
     Total |   487,320     15,352 |   502,672 
*/

// All the samples with missing info and no flag have no information on sex, heterozygosity etc. They are marked as flagged too.

/* Dilnoza's tabulation - much fewer people with missing info(1) and not flagged (0) = 836 compared to 13,527 in Rita's case

           |     missing_info
      flag |         0          1 |     Total
-----------+----------------------+----------
         0 |   446,329        836 |   447,165 
         1 |    40,968     14,492 |    55,460 
-----------+----------------------+----------
     Total |   487,297     15,328 |   502,625 
	 

*/
replace flag = 1 if missing_info == 1
* (836 real changes made)

// From here on the data set contains only flagged samples
keep if flag == 1
*(446,329 observations deleted)

// ---------------------------------------------------------------------
// Step 5: attach the family id from the FAM file to the flagged samples
// and export the list (family id, individual id) as a tab-delimited file.
// ---------------------------------------------------------------------
rename _merge _mmerge

// Merge with the original fam file to have both the fam and the individual id.
// Flagged samples without a UKB id (id == .) cannot be matched to the FAM file and
// would break the 1:1 merge, so they are dropped first. This used to rely on a
// hard-coded duplicate count (15226), which silently stops working as soon as the
// number of samples with a missing id changes.
drop if missing(id_ukb)
*(15,227 observations deleted)


merge 1:1 id_ukb using "Analysis\Input\ukb_hm3_chr1.dta", gen(_mergeHM3)
/*


    Result                           # of obs.
    -----------------------------------------
    not matched                       446,340
        from master                         0  (_mergeHM3==1)
        from using                    446,340  (_mergeHM3==2)

    matched                            41,069  (_mergeHM3==3)
    -----------------------------------------

	*/

// Why: the unmatched rows all come from the FAM file (_mergeHM3==2), i.e. they are
// samples that were not flagged above. Dropping them leaves only the flagged samples,
// now with their family id attached.
drop if _mergeHM3!=3
*(446,340 observations deleted)

order id_ukb_fam id_ukb
sort id_ukb_fam

keep id_ukb_fam id_ukb

count 

// Two columns, no header row: family id, individual id
export delimited using "Analysis\Input\list_samples_qc_non_consent_UKBB_v2.txt", delimiter(tab) novarnames replace


